sightings = read_csv('data/NYC_Rat_Sightings.csv') |>
janitor::clean_names() |>
separate(created_date, into=c("month","e", "day","f", "year", "g", "time"), sep=c(2,3,5,6,10,11)) |>
select(-e,-f,-g) |>
mutate(date = paste(year, month, day, sep=""),
date = as.numeric(date)) |>
filter(date <= 20231031,
date >= 20160101,
!incident_zip <= 10000,
!incident_zip > 11697,
!borough %in% c("Unspecified", NA)) |>
select(-agency, -agency_name, -complaint_type, -descriptor, -landmark, -facility_type, -park_facility_name, -vehicle_type, -taxi_company_borough, -taxi_pick_up_location, -bridge_highway_name, -road_ramp, -bridge_highway_segment, -bridge_highway_direction) |> select(unique_key, date, year, month, day, everything())
## Warning: One or more parsing issues, call `problems()` on your data
## frame for details, e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 232143 Columns: 38
## ── Column specification ─────────────────────────────────────
## Delimiter: ","
## chr (25): Created Date, Closed Date, Agency, Agency Name, Complaint Type, De...
## dbl (6): Unique Key, Incident Zip, X Coordinate (State Plane), Y Coordinate...
## lgl (7): Vehicle Type, Taxi Company Borough, Taxi Pick Up Location, Bridge ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
story: rat sightings have gone up drastically since 2020, when their populations were able to thrive as people were quarantined inside. based on rat sighting data, what is the scope of the NYC rats problem?
# total count per year - sharp increase after 2020, peak 2022
sightings |> group_by(year) |>
summarize(count=n()) |>
plot_ly(x=~year, y=~count, type="scatter", mode="line")
# total count per year by borough - most in brooklyn and manhattan
sightings |>
group_by(year, borough) |>
summarize(count=n()) |>
plot_ly(x=~year, y=~count, color=~borough, type="scatter", mode="line")
## `summarise()` has grouped output by 'year'. You can override
## using the `.groups` argument.
# trends by month of the year - bulk between april/may to october/november
sightings |>
group_by(year,month) |>
summarize(count = n()) |>
ggplot(aes(x=month, y=count, color=year, group=year)) + geom_line()
## `summarise()` has grouped output by 'year'. You can override
## using the `.groups` argument.

# 2023 rat sightings - deepness of color shows density of counts: not a lot in SI, west side of bk, UWS/CPwest, some in upper MH, spread out in bronx
sightings |>
filter(year=="2023") |>
plot_ly(x=~longitude, y=~latitude, type="scatter", mode="markers", marker=list(color="darkblue",size=1, opacity=0.5))
## Warning: Ignoring 138 observations
# all rat sightings
sightings |>
plot_ly(x=~longitude, y=~latitude, type="scatter", mode="markers", marker=list(size=1, opacity=0.05))
## Warning: Ignoring 1635 observations
a rat czar was appointed in april 2023. what have the number of rat sightings looked like in the 6 months before vs after she was crowned?
# looking at rat czar effects
rat_czar = sightings |>
filter(date >= 20221001) |>
group_by(year, month) |>
summarize(count=n()) |>
mutate(month_year = paste(month, year, sep="_"),
month_year = as.factor(month_year))
## `summarise()` has grouped output by 'year'. You can override
## using the `.groups` argument.
# graph of counts per month, 6 months before and after coronation
rat_czar |>
ggplot(aes(x=month_year, y=count)) + geom_point() + geom_vline(xintercept="04_2023", color="red", linetype="dashed") + theme(axis.text.x=element_text(angle=90))

# linear regression before and after
# 6 months before
czar_before = rat_czar |> filter(month_year %in% c("10_2022","11_2022", "12_2022", "01_2023", "02_2023", "03_2023", "04_2023")) |>
mutate(months = case_when(month_year == "10_2022" ~ 1,
month_year =="11_2022" ~ 2,
month_year == "12_2022" ~ 3,
month_year == "01_2023" ~ 4,
month_year == "02_2023" ~ 5,
month_year == "03_2023" ~ 6,
month_year == "04_2023" ~ 7))
czar_before |> ggplot(aes(x=months, y=count)) + geom_point() + geom_line()

czar_before |>
lm(count~months, data=_) |> broom::tidy() |> knitr::kable()
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 1581.14286 | 240.0218 | 6.587498 | 0.0012107 |
| months | 58.39286 | 53.6705 | 1.087988 | 0.3262413 |
# slope is increasing but NOT significant (y= 1581.14 + 58.39x)
czar_after = rat_czar |> filter(month_year %in% c("04_2023","05_2023","06_2023", "07_2023", "08_2023", "09_2023", "10_2023")) |>
mutate(months = case_when(month_year == "04_2023" ~ 1,
month_year == "05_2023" ~ 2,
month_year =="06_2023" ~ 3,
month_year == "07_2023" ~ 4,
month_year == "08_2023" ~ 5,
month_year == "09_2023" ~ 6,
month_year == "10_2023" ~ 7))
czar_after |> ggplot(aes(x=months, y=count)) + geom_point() + geom_line()

czar_after |>
lm(count~months, data=_) |> broom::tidy() |> knitr::kable()
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 2441.285714 | 279.08714 | 8.7473959 | 0.0003236 |
| months | -5.714286 | 62.40578 | -0.0915666 | 0.9305977 |
# slope is decreasing but also not significant ( y= 2441.29 - 5.71x)
This data doesn’t look very promising, as it seems like there are probably a lot of other factors that are associated with rat sightings. So what does the rat czar need to do to tackle the NYC rat problem?